I am looking for options to sell my products online, using platforms like Amazon. Right now I am not clear about which products are the best to invest in: I consider myself a small investor (willing to invest up to 5 lakh rupees), and so, quite naturally, a risk-averse one.
I wish to perform sentiment analysis for product reviews from, say, Amazon, to select the best possible products for me.
Expected Answer:
Suggest the top five products for this investment bracket, with a detailed report using sentiment analysis on those products' reviews.
import ast
import gzip
import itertools
import re
import string
import unicodedata

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from wordcloud import WordCloud,STOPWORDS
def parse(path):
    """Lazily yield one record per line from a gzipped file of dict literals.

    Each line of the Amazon SNAP review dumps is a Python dict literal, so
    ``ast.literal_eval`` is used instead of ``eval``: it accepts the same
    literal syntax but cannot execute arbitrary code from an untrusted file.
    The ``with`` block also closes the file handle, which the original leaked.
    """
    with gzip.open(path, 'rb') as gz:
        for line in gz:
            yield ast.literal_eval(line.decode('utf-8'))
def getDF(path):
    """Load every record from a gzipped review file into a pandas DataFrame.

    Rows are indexed 0..n-1 in file order (integer keys via ``orient='index'``).
    """
    records = {row_number: record for row_number, record in enumerate(parse(path))}
    return pd.DataFrame.from_dict(records, orient='index')
# ---------------------------------------------------------------------------
# Load the raw reviews and product metadata. The two gzipped dataset files
# are expected in the current working directory (no existence check is done).
# The bare .head()/.isnull().sum() expressions only display output when this
# runs as notebook cells.
# ---------------------------------------------------------------------------
review_df = getDF('reviews_Health_and_Personal_Care_5.json.gz')
metadata_df = getDF('meta_Health_and_Personal_Care.json.gz')
print ("Total data:", str(review_df.shape))
review_df.info()
review_df.head()
print ("Total metadata:", str(metadata_df.shape))
metadata_df.info()
metadata_df.head()
review_df.isnull().sum()
metadata_df.isnull().sum()
# MERGE: attach product metadata to every review via the product id ('asin');
# a left join keeps reviews even when the product has no metadata row.
product_reviews_df=pd.merge(review_df,metadata_df,on='asin',how='left')
print ("Total products with reviews:", str(product_reviews_df.shape))
product_reviews_df.info()
product_reviews_df.head()
product_reviews_df.isnull().sum()
# Drop rows whose product title is null (title is needed for reporting).
product_reviews_df=product_reviews_df.dropna(subset=['title'])
print ("Total products with reviews after removing null titles:", str(product_reviews_df.shape))
product_reviews_df.info()
product_reviews_df.head()
product_reviews_df.isnull().sum()
# Fill null brand names with the first word of the title
# (heuristic: many titles begin with the brand — TODO confirm data quality).
product_reviews_df.loc[product_reviews_df.brand.isnull(),'brand'] = product_reviews_df.title.str.split().str.get(0)
product_reviews_df.isnull().sum()
# Drop irrelevant columns: sales rank, image URL and related-product lists
# are not used by the sentiment pipeline.
product_reviews_df=product_reviews_df.drop(columns=['salesRank','imUrl','related'],axis=1)
product_reviews_df.isnull().sum()
print ("Total products with reviews after removing null columns:", str(product_reviews_df.shape))
product_reviews_df.info()
product_reviews_df.head()
# Concatenate the short summary with the full review text into one field;
# NaN parts are skipped (str(y) != 'nan' is the NaN test used here).
product_reviews_df['review_text'] = product_reviews_df[['summary', 'reviewText']].apply(lambda x: " ".join(str(y) for y in x if str(y) != 'nan'), axis = 1)
product_reviews_df = product_reviews_df.drop(['reviewText', 'summary'], axis = 1)
product_reviews_df.head()
product_reviews_df['overall'].value_counts()
# Binarise star ratings: 1-2 stars -> 'bad', 3-5 stars -> 'good'.
product_reviews_df['rating'] = product_reviews_df['overall'].apply(lambda x: 'bad' if x < 3 else'good')
#product_reviews_df = product_reviews_df.drop(['overall'], axis = 1)
product_reviews_df.head()
# 'helpful' is a [helpful_votes, total_votes] pair; guard against 0 votes.
product_reviews_df['helpful_rate'] = product_reviews_df['helpful'].apply(lambda x: 0.0 if x[1] == 0 else x[0]/x[1])
product_reviews_df = product_reviews_df.drop(['helpful'], axis = 1)
product_reviews_df.head()
# Inspect duplicates keyed on ('asin', 'reviewerName', 'unixReviewTime').
product_reviews_df[product_reviews_df.duplicated(['reviewerName','unixReviewTime','asin'],keep= False)]
# Drop the duplicates, keeping the first occurrence.
product_reviews_df = product_reviews_df.drop_duplicates(['asin','reviewerName', 'unixReviewTime'], keep = 'first')
product_reviews_df.head()
product_reviews_df.shape
# Columns no longer needed after de-duplication.
product_reviews_df = product_reviews_df.drop(['unixReviewTime', 'reviewerName'], axis = 1)
# Parse reviewTime strings (e.g. "04 3, 2014") into proper datetimes.
product_reviews_df['review_time'] = product_reviews_df.reviewTime.str.replace(',', "")
product_reviews_df['review_time'] = pd.to_datetime(product_reviews_df['review_time'], format = '%m %d %Y')
product_reviews_df = product_reviews_df.drop(['reviewTime'], axis = 1)
product_reviews_df.head()
# --- Exploratory summary statistics and distribution plots ------------------
total = len(product_reviews_df)
print ("Number of reviews: ",total)
print ("Number of unique reviewers: ",len(product_reviews_df.reviewerID.unique()))
print ("Number of unique products: ", len(product_reviews_df.asin.unique()))
print ("Average rating score: ",round(product_reviews_df.overall.mean(),3))
print ("Average helpfull ratio score: ",round(product_reviews_df.helpful_rate.mean(),3))
rating = product_reviews_df['rating'].value_counts()
# NOTE(review): positional x/y arguments were removed in seaborn >= 0.12;
# this call requires an older seaborn — confirm the pinned version.
sns.barplot(rating.index, rating.values)
plt.figure(figsize=(12,8))
product_reviews_df['overall'].value_counts().sort_index().plot(kind='bar')
plt.title('Distribution of Rating')
plt.xlabel('Rating')
plt.ylabel('Number of Reviews')
plt.figure(figsize=(12,8))
# Bucket helpful_rate into ten 0.1-wide bins for the histogram below.
cutoff = np.array([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
Dist_help = product_reviews_df.groupby([pd.cut(product_reviews_df['helpful_rate'], bins = cutoff)]).size()
Dist_help.plot(kind='bar')
plt.title("Distribution of Helpfulness")
plt.xlabel("Helpful proportion")
plt.ylabel("Number of Reviews")
plt.xticks(rotation=0)
plt.show()
# NOTE(review): distplot is deprecated in modern seaborn (use histplot).
sns.distplot(product_reviews_df['helpful_rate'], bins=20)
product_reviews_df.groupby('overall').size()
def strip_html(text):
    """Return the visible text of *text* with all HTML markup removed."""
    parsed = BeautifulSoup(text, "html.parser")
    return parsed.get_text()
def remove_between_square_brackets(text):
    """Delete every '[...]' span (e.g. editorial notes) from *text*.

    The pattern is a raw string: the original non-raw literal relied on the
    invalid escape '\\[', which raises a warning on modern Python.
    """
    return re.sub(r'\[[^]]*\]', '', text)
def denoise_text(text):
    """Strip HTML markup, then bracketed spans, from raw review text."""
    return remove_between_square_brackets(strip_html(text))
def remove_special_characters(text, remove_digits=True):
    """Keep only ASCII letters, whitespace and (optionally) digits.

    Bug fix: the original class used 'A-z', a range that also matches the
    six punctuation characters between 'Z' and 'a' ('[', '\\', ']', '^',
    '_', '`') and let them leak through; 'A-Z' is the intended range.
    """
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
def remove_non_ascii(words):
    """Transliterate each token to its closest ASCII form, dropping the rest."""
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]
def to_lowercase(words):
    """Lower-case every token in *words*."""
    return [token.lower() for token in words]
def remove_punctuation_and_splchars(words):
    """Strip punctuation/special characters from each token.

    Tokens reduced to the empty string by punctuation removal are dropped
    entirely; surviving tokens are further cleaned of special characters.
    """
    cleaned = []
    for token in words:
        stripped = re.sub(r'[^\w\s]', '', token)
        if stripped == '':
            continue
        cleaned.append(remove_special_characters(stripped, True))
    return cleaned
def replace_numbers(words):
    """Replace purely-numeric tokens with their spelled-out English words.

    Bug fix: ``inflect`` was used but never imported anywhere in the file,
    so calling this raised NameError. It is imported lazily here, and only
    when a digit token is actually encountered, so input without digits
    works even when the package is absent.
    """
    engine = None
    converted = []
    for word in words:
        if word.isdigit():
            if engine is None:
                import inflect  # third-party; required only for digit tokens
                engine = inflect.engine()
            converted.append(engine.number_to_words(word))
        else:
            converted.append(word)
    return converted
# Standard English stopword list, but keep the negations 'no' and 'not' —
# they carry sentiment signal ("not good" differs from "good").
stopword_list= stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
def remove_stopwords(words):
    """Drop tokens present in the module-level ``stopword_list``."""
    return [token for token in words if token not in stopword_list]
def stem_words(words):
    """Reduce each token to its Lancaster stem."""
    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]
def lemmatize_verbs(words):
    """Lemmatize each token as a verb (pos='v'), e.g. 'running' -> 'run'."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in words]
def normalize(words):
    """Token-level cleanup pipeline: ASCII-fold, lower-case, de-punctuate,
    then remove stopwords (in that order)."""
    for step in (remove_non_ascii, to_lowercase,
                 remove_punctuation_and_splchars, remove_stopwords):
        words = step(words)
    return words
def lemmatize(words):
    """Thin wrapper over verb lemmatization, kept for pipeline symmetry."""
    return lemmatize_verbs(words)
def normalize_and_lemmaize(text):
    """Full text-cleaning pipeline: denoise -> tokenize -> normalize -> lemmatize.

    Returns the cleaned tokens rejoined into a single space-separated string.
    The parameter was renamed from ``input``, which shadowed the builtin;
    the only call site in this file passes it positionally, so callers are
    unaffected.
    """
    sample = denoise_text(text)
    sample = remove_special_characters(sample)
    words = nltk.word_tokenize(sample)
    words = normalize(words)
    return ' '.join(lemmatize(words))
# Run every combined review text through the full NLP cleaning pipeline
# (slow: tokenization + lemmatization per row).
product_reviews_df['clean_text'] = product_reviews_df['review_text'].map(lambda text: normalize_and_lemmaize(text))
product_reviews_df.head()
# Split the cleaned texts by sentiment label for word-cloud comparison.
good_reviews = product_reviews_df[product_reviews_df['rating'] == 'good'].clean_text
bad_reviews = product_reviews_df[product_reviews_df['rating'] == 'bad'].clean_text
def plot_word_cloud(text):
    """Render a word cloud of *text* on a black 40x30-inch figure.

    NOTE(review): ``str(text)`` on a pandas Series/DataFrame includes the
    index and is truncated by pandas display options — confirm that this is
    the intended corpus for the cloud.
    """
    cloud = WordCloud(width=3000,
                      height=2000,
                      background_color='black',
                      stopwords=STOPWORDS).generate(str(text))
    plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
plot_word_cloud(good_reviews)
plot_word_cloud(bad_reviews)
# Binary target for classification: 'bad' -> 0, 'good' -> 1.
product_reviews_df['rating_class'] = product_reviews_df['rating'].apply(lambda x: 0 if x == 'bad' else 1)
X = product_reviews_df['clean_text']
y = product_reviews_df['rating_class']
# Bigram-only TF-IDF features, capped at the 5000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000,ngram_range=(2,2))
# TF-IDF feature matrix (X becomes a sparse matrix from here on).
X= tfidf_vectorizer.fit_transform(X)
# 75/25 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
def modeling(Model, Xtrain = X_train, Xtest = X_test):
    """Fit *Model* on the training split and print its weighted f1 score.

    Side effect: stores the test-set predictions in the module-level global
    ``y_pred``, which the reporting cells below read back. Note the defaults
    bind the module-level X_train/X_test at definition time.
    """
    global y_pred
    classifier = Model
    classifier.fit(Xtrain, y_train)
    y_pred = classifier.predict(Xtest)
    weighted_f1 = f1_score(y_test, y_pred, average = 'weighted')
    print("f1 score: {}".format(weighted_f1))
def display_cm(cm):
    """Render a 2x2 confusion matrix as an annotated green heatmap."""
    plt.style.use('default')
    labels = ['BAD', 'GOOD']
    frame = pd.DataFrame(cm, index = labels, columns = labels)
    sns.heatmap(frame, cmap = "Greens", linecolor = 'black', linewidth = 2,
                annot = True, fmt = '', xticklabels = labels, yticklabels = labels)
# --- Train and compare candidate classifiers --------------------------------
# NOTE(review): multi_class='multinomial' on a binary target adds nothing
# over the default binomial formulation — confirm intent.
modeling(LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg',
         class_weight = 'balanced', C = 0.1, n_jobs = -1, random_state = 42))
# modeling() stores its predictions in the global y_pred; snapshot it per model.
y_pred_cv_logreg = y_pred
print(classification_report(y_test, y_pred_cv_logreg))
display_cm(confusion_matrix(y_test, y_pred_cv_logreg))
modeling(RandomForestClassifier(n_estimators = 200, random_state = 42))
y_pred_cv_rf = y_pred
print(classification_report(y_test, y_pred_cv_rf))
display_cm(confusion_matrix(y_test, y_pred_cv_rf))
modeling(DecisionTreeClassifier())
y_pred_dt= y_pred
print(classification_report(y_test, y_pred_dt))
display_cm(confusion_matrix(y_test, y_pred_dt))
modeling(BernoulliNB())
y_pred_nb= y_pred
print(classification_report(y_test, y_pred_nb))
display_cm(confusion_matrix(y_test, y_pred_nb))
modeling(KNeighborsClassifier())
y_pred_knn= y_pred
print(classification_report(y_test, y_pred_knn))
display_cm(confusion_matrix(y_test, y_pred_knn))
# Grid-search BernoulliNB's smoothing parameter alpha with 5-fold CV.
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
}
bernoulli_nb_grid = GridSearchCV(BernoulliNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
# NOTE(review): fitting on the full X/y and then scoring X_test leaks the
# test split into model selection — fit on X_train only for honest estimates.
bernoulli_nb_grid.fit(X,y)
print('Train Accuracy : %.3f'%bernoulli_nb_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f'%bernoulli_nb_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f'%bernoulli_nb_grid.best_score_)
print('Best Parameters : ',bernoulli_nb_grid.best_params_)
# Product score: star rating weighted by the review's helpfulness ratio.
product_reviews_df['overall_helpful_rate'] = product_reviews_df['overall']* product_reviews_df['helpful_rate']
# Per-product sum of the weighted score, counting only 'good' reviews.
good_ratings_sum = pd.DataFrame(product_reviews_df[product_reviews_df['rating'] == 'good']
                                .groupby(['asin'])['overall_helpful_rate']
                                .sum())
# Sort descending and keep only the top 5 products.
top_5=good_ratings_sum.sort_values('overall_helpful_rate', ascending = False).head(5)
# Merge back to recover product details; drop_duplicates collapses the many
# review rows per product down to one row each.
top_5_popular=top_5.merge(product_reviews_df,left_index = True, right_on = 'asin').drop_duplicates(
    ['asin', 'title'])[['asin', 'title','price','categories']]
# Pull the metadata rows (which still carry image URLs) for the winners.
top_5_popular_with_img = pd.DataFrame(metadata_df.loc[metadata_df['asin'].isin(top_5_popular['asin'])])
top_5_popular_with_img = top_5_popular_with_img.drop(['related'], axis = 1)
from IPython.display import HTML
def path_to_image(path):
    """Wrap an image URL in an ``<img>`` tag for HTML table rendering.

    Bug fix: the original attribute was missing the opening quote
    (``style=max-height:124px;"``), producing malformed HTML.
    """
    return '<img src="' + path + '" style="max-height:124px;"/>'
# Render the top-5 table with inline product images (notebook display only).
HTML(top_5_popular_with_img[['asin','imUrl','title']].to_html(escape=False , formatters={'imUrl': path_to_image}))
top_5_product_reviews_df = pd.DataFrame(product_reviews_df.loc[product_reviews_df['asin'].isin(top_5_popular_with_img['asin'])])
# One word cloud per winning product.
# NOTE(review): this passes the whole per-product DataFrame (not its
# clean_text column) to plot_word_cloud, so the cloud is built from
# str(DataFrame) — confirm this is intended.
for asin in top_5_popular_with_img['asin']:
    plot_word_cloud(pd.DataFrame(product_reviews_df.loc[product_reviews_df['asin'] == asin]))
sns.countplot(top_5_product_reviews_df.title)
plt.xticks(rotation=90)
sns.distplot(top_5_product_reviews_df['overall'], bins=20)
sns.countplot(top_5_product_reviews_df['title'], hue=top_5_product_reviews_df['rating'])
plt.xticks(rotation=90)
By looking at the review counts, rating distribution, and word clouds for the top-selling products on Amazon in the given category, we can suggest the following to our investor:
- "Tide Original Scent Liquid Laundry Detergent , 50 Fl Oz, 2 Count" is most widely used product with great positive reviews and rating while "Tide Pods Laundry Detergent Alpine Breeze Scent 18 Count" is the second highest in this range.
- "Jarrow Formulas Methylcobalamin (Methyl B12), 5000mcg, 60 Lozenges" is widely used for B12 followed by "Jarrow Formulas Magmind Nutritional Supplement, 90 Count"
- "Dropps Laundry Detergent Pacs, Fresh Scent, 20 Loads (Pack of 3)" comes last in top 5 products to invest recommendation.
- Most of the bad reviews were genuine criticism of the products from buyers, so we can treat them as feedback and use them to improve the products further and increase profit.
- The liquid form of detergent is more popular than the powder form; we can use this insight to diversify the investment within detergents. We also need to pay more attention to skin sensitivity, along with smell, cleanliness, pack size, color, scent, and similar properties.